"""
Code to get the optimized model for each label.
This is obtained by running a grid search optimizing the accuracy and F1 score
"""
import pandas as pd
from scipy.sparse import load_npz
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Metrics computed for every candidate during the grid search.
scoring = {
    'F1': 'f1',
    'Accuracy': make_scorer(accuracy_score),
}

# Single-step pipeline; the 'estimator' step is a placeholder that the
# grid below replaces with each candidate classifier.
pipe = Pipeline(steps=[('estimator', MultinomialNB())])

# One dict per candidate estimator, together with the hyper-parameters to
# sweep for it. Append a new dict here to try another classifier.
params_grid = [
    {
        'estimator': [MultinomialNB()],
    },
    {
        'estimator': [RandomForestClassifier()],
        'estimator__n_estimators': [100, 500, 1000],
    },
    {
        'estimator': [GradientBoostingClassifier()],
        'estimator__n_estimators': [100, 500, 1000],
    },
    {
        'estimator': [LogisticRegression(solver='lbfgs', max_iter=2000)],
        'estimator__C': [1e2, 1e3, 1e4, 1e5],
    },
]


# Sparse feature matrix saved in SciPy's .npz format.
X = load_npz('../data_clean/vectorized_text.npz')
# Table of labels; the columns listed in `label_list` are read from it.
y_all = pd.read_csv('../data_clean/labels.csv')


# Columns of the labels table for which a model is searched, one per line.
label_list = [
    'S6_is_formal',
    'S6_is_legal',
    'S6_is_technical',
    'S6_is_aggressive',
    'S8_dummy_Activities',
    'S8_dummy_Budget',
    'S8_dummy_Evaluation',
    'S8_dummy_ExternalContracts',
    'S8_dummy_InstStruc',
    'S8_dummy_Other',
    'S8_dummy_Regulatory',
    'S9_dummy_Academic/Scholarly',
    'S9_dummy_Commercial',
    'S9_dummy_Impossible to say',
    'S9_dummy_Monitoring',
    'S9_dummy_Personal',
    'S10_is_clear',
    'S10_is_competency_of_institution',
    'S10_is_public',
    'S10_is_existant',
    'S11_dummy_Date',
    'S11_dummy_Document',
    'S11_dummy_Institution',
    'S11_dummy_Organization',
    'S11_dummy_Person',
    'S11_dummy_Place',
]

# For each label: split, oversample the training split with SMOTE, run the
# grid search, and print the top-ranked configuration with its mean scores.
for label in label_list:
    # A fresh search per label. Every candidate in `params_grid` is scored
    # on both metrics; `refit='Accuracy'` selects the model that is refit
    # after the search.
    grid = GridSearchCV(pipe, params_grid, scoring=scoring,
                        n_jobs=5, refit='Accuracy')
    # Hold out 20% of the rows with a fixed seed. The held-out part is not
    # used inside this loop.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_all[label].values, test_size=0.20, random_state=42)
    # Oversample the training split with SMOTE before searching.
    # `fit_resample` is the current API; the older `fit_sample` alias was
    # removed from imbalanced-learn and raises AttributeError there.
    # NOTE(review): resampling before cross-validation means synthetic
    # samples can end up in the validation folds, which may inflate the
    # reported scores. Consider putting SMOTE inside an
    # imblearn.pipeline.Pipeline so it is fit per fold -- confirm intent.
    sm = SMOTE(random_state=42)
    x_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    grid.fit(x_train_res, y_train_res)

    # Rank the candidates once (F1 rank first, accuracy rank as the
    # tie-breaker) and keep only the top row, instead of rebuilding and
    # re-sorting the results table for every printed value.
    best = pd.DataFrame(grid.cv_results_).sort_values(
        by=['rank_test_F1', 'rank_test_Accuracy'], ascending=True).head(1)

    print(label)
    print(best['params'].item())
    print(best['mean_test_F1'].item())
    print(best['mean_test_Accuracy'].item())
